!
! Copyright (C) 2001-2009 Quantum ESPRESSO group
! Copyright (C) 2009 Brian Kolb, Timo Thonhauser - Wake Forest University
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!----------------------------------------------------------------------------
program generate_kernel_
  use mp_global,            ONLY : mp_startup, mp_global_end
  use mp_world,             ONLY : world_comm
  
  implicit none  

  call mp_startup ()
  CALL generate_kernel ( world_comm )
  call mp_global_end( )

END program generate_kernel_

!----------------------------------------------------------------------------
SUBROUTINE generate_kernel ( my_comm )
  !--------------------------------------------------------------------------------------------

  !! This is a stand-alone program to generate the file
  !! "vdW_kernel_table" needed for a van der Waals run.  There should be no
  !! need, in general, to use this program as the default kernel file
  !! supplied with the distribution should suffice for most cases.
  !! However, if that file is insufficient for a particular purpose, a more
  !! suitable kernel file can be generated by running this program.
  
  !! This method is based on the method of Guillermo Roman-Perez and Jose
  !! M. Soler described in:
  
  !!    G. Roman-Perez and J. M. Soler, PRL 103, 096102 (2009)
  
  !! henceforth referred to as SOLER. That method is a new implementation
  !! of the method found in:
  
  !!    M. Dion, H. Rydberg, E. Schroeder, D. C. Langreth, and
  !!    B. I. Lundqvist, Phys. Rev. Lett. 92, 246401 (2004).
  
  !! henceforth referred to as DION. Further information about the
  !! functional and its corresponding potential can be found in:

  !!    T. Thonhauser, V.R. Cooper, S. Li, A. Puzder, P. Hyldgaard,
  !!    and D.C. Langreth, Phys. Rev. B 76, 125112 (2007).

  !! A review article that shows many of the applications vdW-DF has been
  !! applied to so far can be found at:

  !!    D. C. Langreth et al., J. Phys.: Condens. Matter 21, 084203 (2009).
 
 
  !! The original definition of the kernel function is given in DION
  !! equations 13-16.  The Soler method makes the kernel function a
  !! function of only 1 variable (r) by first putting it in the form
  !! phi(q1*r, q2*r).  Then, the q-dependence is removed by expanding the
  !! function in a special way (see SOLER equation 3).  This yields a
  !! separate function for each pair of q points that is a function of r
  !! alone.  There are (N^2+N)/2 unique functions, where N is the number of
  !! q points used.  In the Soler method, the kernel is first made in the
  !! form phi(d1, d2) but this is not done here.  It was found that, with
  !! q's chosen judiciously ahead of time, the kernel and the second
  !! derivatives required for interpolation could be tabulated ahead
  !! of time for faster use of the vdW_FD functional.  This means equations 
  !! 8-10 of SOLER are not used.  There is no nead to soften the kernel and 
  !! correct for this later.  
  
  !! The algorithm employed here is "embarrassingly parallel", meaning that
  !! it parallelizes very well up to (N^2+N)/2 processors, where,
  !! again, N is the number of q points chosen.  However, parallelization
  !! on this scale is unnecessary.  In testing the code runs in under a
  !! minute on 16 Intel Xeon processors.

  !! IMPORTANT NOTICE: results are very sensitive to compilation details.
  !! In particular, the usage of FMA (Fused Multiply-and-Add) instructions
  !! used by modern CPU such as AMD Interlagos (Bulldozer), Intel Ivy Bridge,
  !!  may affect quite heavily some components of the kernel table 
  !! (communication by Ake Sandberg, Umea University). In practice this should
  !! not be a problem, since most affected elements are the less relevant ones.

  !! Some of the algorithms here are somewhat modified versions of those found
  !! in the book: 

  !!    Numerical Recipes in C; William H. Press, Brian P. Flannery, Saul A. 
  !!    Teukolsky, and William T. Vetterling.  Cambridge University Press (1988).

  !! hereafter referred to as NUMERICAL_RECIPES.  The routines were
  !! translated to Fortran, of course and variable names are generally different.


  !! For the calculation of the kernel we have benefited from access to
  !! earlier vdW-DF implementation into PWscf and ABINIT, written by Timo
  !! Thonhauser, Valentino Cooper, and David Langreth. These codes, in turn,
  !! benefited from earlier codes written by Maxime Dion and Henrik
  !! Rydberg.



  
  !! Use some PWSCF modules.  In particular, we need the parallelization modules.
  !! --------------------------------------------------------------------------------------------

  use mp,                   ONLY : mp_get, mp_barrier, mp_size, mp_rank
  use kinds,                ONLY : dp
  use io_global,            ONLY : ionode
  use constants,            ONLY : pi
  
  !! --------------------------------------------------------------------------------------------
  
  implicit none  
  
  integer, intent (in) :: my_comm

  !! These are the user set-able parameters.  
  
  integer, parameter :: Nr_points = 1024         !! The number of radial points (also the number of k points) used in the formation
  !                                              !! of the kernel functions for each pair of q values.  Increasing this value will 
  !                                              !! help in case you get a run-time error saying that you are trying to use a k value
  !                                              !! that is larger than the largest tabulated k point since the largest k point will
  !                                              !! be 2*pi/r_max * Nr_points.  Memory usage of the vdW_DF piece of PWSCF will increase
  !                                              !! roughly linearly with this variable.
  
  real(dp), parameter :: r_max = 100.0D0         !! The value of the maximum radius to use for the real-space kernel functions for each
  !                                              !! pair of q values.  The larger this value is the smaller the smallest k value will be
  !                                              !! since the smallest k point value is 2*pi/r_max.  Be careful though, since this will
  !                                              !! also decrease the maximum k point value and the vdW_DF code will crash if it encounters
  !                                              !! a g-vector with a magnitude greater than 2*pi/r_max *Nr_points
  
  
  !! Integration parameters for the kernel. These are based on DION.
  !! Changing these MAY make the kernel more accurate.  They will not affect the run time or memory 
  !! usage of the vdW-DF code.
  !!-------------------------------------------------------------------------------------------------
  
  integer, parameter :: Nintegration_points = 256          !! Number of integration points for real-space kernel generation (see DION
  !                                                        !! equation 14).  This is how many a's and b's there will be.

  real(dp), parameter :: a_min = 0.0D0                     !! Starting value for the a and b integration in DION equation 14

  real(dp), parameter :: a_max = 64.0D0                    !! Maximum value for the a and b integration in DION equation 14

  !!-------------------------------------------------------------------------------------------------

  CHARACTER(LEN=30) :: double_format = "(1p4e23.14)"


  !! The next 2 parameters define the q mesh to be used in the vdW_DF code.  These are perhaps the most important to have
  !! set correctly.  Increasing the number of q points will DRAMATICALLY increase the memory usage of the vdW_DF code because
  !! the memory consumption depends quadratically on the number of q points in the mesh.
  !! Increasing the number of q points may increase accuracy of the vdW_DF code, although, in testing it was found to have little effect.  
  !! The largest value of the q mesh is q_cut.  All values of q0 (DION equation 11) larger than this value during a run will be saturated 
  !! to this value using equations 6-7 of SOLER.  In testing, increasing the value of q_cut was found to have little impact on the results,
  !! though it is possible that in some systems it may be more important.  Always make sure that the variable Nqs is consistent with 
  !! the number of q points that are actually in the variable q_mesh.  Also, do not set any q value to 0.  This will cause an infinity
  !! in the Fourier transform.
  !! ---------------------------------------------------------------------------------------------------------------------------------------
  !!                                        CHANGE THESE VALUES AT YOUR OWN RISK
  
  integer, parameter :: Nqs = 20

  real(dp), dimension(Nqs):: q_mesh = (/  1.00D-5, 0.0449420825586261D0, 0.0975593700991365D0, &
       0.159162633466142D0, 0.231286496836006D0, 0.315727667369529D0, 0.414589693721418D0, &
       0.530335368404141D0, 0.665848079422965D0, 0.824503639537924D0, 1.010254382520950D0, &
       1.227727621364570D0, 1.482340921174910D0, 1.780437058359530D0, 2.129442028133640D0, &
       2.538050036534580D0, 3.016440085356680D0, 3.576529545442460D0, 4.232271035198720D0, &
       5.0D0 /)
  
  !! ---------------------------------------------------------------------------------------------------------------------------------------
  

  !! The following are a few suggested sets of parameters that may be useful in some systems.  Again, only
  !! change the default values if 1) you know what you're doing and 2) the default values are insufficient
  !! (or suspected to be insufficient) for your particular system.  Use these Sets by commenting out the 
  !! definition of Nqs and q_mesh above and uncommenting 1 of the desired sets below.  You may also make your
  !! own set if you know what you're doing.
  !! --------------------------------------------------------------------------------------------------------------

  !! Uncomment to use a q_mesh of 25 points with a cutoff of 5
  ! +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  
  !integer, parameter :: Nqs = 25
  !real(dp), dimension(Nqs) :: q_mesh = (/ 1.0D-5, 0.0319324863726618D0, 0.0683071727114252D0, &
  !     0.109742023439998D0, 0.156940969402303D0, 0.210705866844455D0, &
  !     0.271950120037604D0, 0.341714198974465D0, 0.421183315767499D0, &
  !     0.511707560050586D0, 0.614824835461683D0, 0.732286986871156D0, &
  !     0.866089562227575D0, 1.01850571464079D0, 1.19212482065999D0, &
  !     1.38989647082725D0, 1.61518057985587D0, 1.87180446774829D0, &
  !     2.16412788159658D0, 2.49711706271187D0, 2.87642911739861D0, &
  !     3.30850812473687D0, 3.80069461413434D0, 4.36135027254676D0, &
  !     5.0D0 /)
  
  ! +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


  

  !! Uncomment to use a q_mesh of 30 points with a cutoff of 5
  ! +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  
  ! integer, parameter :: Nqs = 30
  ! real(dp), dimension(Nqs) :: q_mesh = (/ 1.0D-5, 0.026559672691443D0, 0.0561185595841672D0, &
  !      0.08901534278204D0, 0.125626949595767D0, 0.166372871329829D0, &
  !      0.211719969762446D0, 0.262187826390619D0, 0.318354695731256D0, &
  !      0.380864130890569D0, 0.4504323573167D0, 0.527856479223139D0, &
  !      0.61402361271113D0, 0.709921050237249D0, 0.816647572889386D0, &
  !      0.935426040085808D0, 1.06761740094853D0, 1.21473628789156D0, &
  !      1.37846837109353D0, 1.56068967270003D0, 1.76348806205544D0, &
  !      1.98918717825406D0, 2.24037305411209D0, 2.51992374661476D0, &
  !      2.83104231334061D0, 3.17729351270267D0, 3.56264464851356D0, &
  !      3.99151102686645D0, 4.46880654617114D0, 5.0D0 &
  !      /)
  
  ! +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


  
  ! Uncomment to use a q_mesh of 30 poits with a cutoff of 8
  ! +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

  ! integer, parameter :: Nqs = 30
  ! real(dp), dimension(Nqs) :: q_mesh = (/ 1.0D-5, 0.0424954763063088D0, 0.0897896953346675D0, &
  !      0.142424548451264D0, 0.201003119353227D0, 0.266196594127727D0, &
  !      0.338751951619913D0, 0.41950052222499D0, 0.50936751317001D0, &
  !      0.609382609424911D0, 0.720691771706719D0, 0.844570366757022D0, &
  !      0.982437780337809D0, 1.1358736803796D0, 1.30663611662302D0, &
  !      1.49668166413729D0, 1.70818784151764D0, 1.94357806062649D0, &
  !      2.20554939374965D0, 2.49710347632005D0, 2.82158089928871D0, &
  !      3.18269948520649D0, 3.58459688657934D0, 4.03187799458362D0, &
  !      4.52966770134498D0, 5.08366962032427D0, 5.7002314376217D0, &
  !      6.38641764298631D0, 7.15009047387383D0, 8.0D0&
  !      /)

  ! +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++  




  !! ------------------------------------------------------------------------------------------------------------------

  









!!                                DO NOT CHANGE ANYTHING BELOW THIS LINE
!! #########################################################################################################
!! #########################################################################################################
!! #########################################################################################################
!! #########################################################################################################
!!                                DO NOT CHANGE ANYTHING BELOW THIS LINE




  integer :: a_i, b_i, q1_i, q2_i, r_i, count                             !! Indexing variables

  real(dp) :: weights(Nintegration_points)                                  !! Array to hold dx values for the Gaussian-Legendre 
  !                                                                       !! integration of the kernel

  real(dp) :: nu(Nintegration_points), nu1(Nintegration_points)           !! Defined in the discussion below equation 16 of DION

  real(dp) :: a(Nintegration_points), a2(Nintegration_points)             !! The values of the points a (DION equation 14) and a^2

  real(dp) :: sin_a(Nintegration_points), cos_a(Nintegration_points)      !! sine and cosine values of the aforementioned points a

  real(dp) :: W_ab(Nintegration_points, Nintegration_points)              !! Defined in DION equation 16

  real(dp) :: dr, d1, d2, d, w, x, y, z, T, integral                      !! Intermediate values
  
  real(dp) :: gamma = 4.0D0*pi/9.0D0                                      !! Multiplicative factor for exponent in the functions called
  !                                                                       !! "h" in DION

  real(dp), parameter :: small = 1.0D-15                                  !! Number at which to employ special algorithms to avoid numerical
  !                                                                       !! problems.  This is probably not needed but I like to be careful.

  

  !! The following sets up a parallel run.
  !! ------------------------------------------------------------------------------------------------------------------------------------------

  integer :: my_start_q, my_end_q, Ntotal                  !! starting and ending q value for each processor, also the total number of 
  !                                                        !! calculations to do (  (Nqs^2 + Nqs)/2  )

  real(dp), allocatable :: phi(:,:), d2phi_dk2(:,:)        !! Arrays to store the kernel functions and their second derivatives.  They are
  !                                                        !! stored as phi(radial_point, idx)

  integer, allocatable :: indices(:,:), proc_indices(:,:)  !! indices holds the values of q1 and q2 as partitioned out to the processors.  It is an 
  !                                                        !! Ntotal x 2 array stored as indices(index of point number, q1:q2).  
  !                                                        !! Proc_indices holds the section of the indices array that is assigned to each processor.  
  !                                                        !! This is a Nprocs x 2 array, stored as proc_indices(processor number, starting_index:ending_index)

  integer :: nproc, mpime                                  !! number or procs, rank of curret processor
  integer :: Nper, Nextra, start_q, end_q                  !! Baseline number of jobs per processor, number of processors that get an extra job in case the 
  !                                                        !! number of jobs doesn't split evenly over the number of processors, starting index into the
  !                                                        !! indices array, ending index into the indices array.

  integer :: idx, proc_i, kernel_file, my_Nqs


  ! The total number of phi_alpha_beta functions that have to be calculated
  Ntotal = (Nqs**2 + Nqs)/2


  allocate( indices(Ntotal, 2) )

  count = 1

  ! This part fills in the indices array.  It just loops through the q1 and q2 values and stores them.  Sections
  ! of this array will be assigned to each of the processors later.
  ! ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

  do q1_i = 1, Nqs
     do q2_i = 1, q1_i

        indices(count, 1) = q1_i
        indices(count, 2) = q2_i

        count = count + 1

     end do
  end do

  ! ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  
  ! Figure out the baseline number of functions to be calculated by each processor and how many processors get 1 extra job.
  nproc = mp_size(my_comm)
  mpime = mp_rank(my_comm)
  Nper = Ntotal/nproc
  Nextra = mod(Ntotal, nproc)


  allocate(proc_indices(nproc,2) )

  start_q = 0
  end_q = 0

  ! Loop over all the processors and figure out which section of the indices array each processor should do.  All processors
  ! figure this out for every processor so there is no need to communicate results.
  ! ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

  do proc_i = 1, nproc

     start_q = end_q + 1
     end_q = start_q + (Nper - 1)

     if (proc_i <= Nextra) end_q = end_q + 1

     if (proc_i == (mpime+1)) then

        my_start_q = start_q
        my_end_q = end_q

     end if

     proc_indices(proc_i, 1) = start_q
     proc_indices(proc_i, 2) = end_q

  end do

  ! ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

  ! Store how many jobs are assigned to me
  my_Nqs = my_end_q - my_start_q + 1
  
  
  !! ------------------------------------------------------------------------------------------------------------------------------------------


  allocate( phi(0:Nr_points, my_Nqs), d2phi_dk2(0:Nr_points, my_Nqs) )
  
  phi = 0.0D0
  d2phi_dk2 = 0.0D0
  
  dr = (r_max)/(Nr_points)

  !! Find the integration points we are going to use in the Gaussian-Legendre integration
  call prep_gaussian_quadrature(a_min, a_max, a, weights, Nintegration_points)
  
  !! Get a, a^2, sin(a), cos(a) and the weights for the Gaussian-Legendre integration
  !! ------------------------------------------------------------------------------------
  
  do a_i=1, Nintegration_points
     
     a(a_i) = tan(a(a_i))
     a2(a_i) = a(a_i)**2
     weights(a_i) = weights(a_i)*(1+a2(a_i))
     cos_a(a_i) = cos(a(a_i))
     sin_a(a_i) = sin(a(a_i))

  end do

  !! ------------------------------------------------------------------------------------


  !! Calculate the value of the W function defined in DION equation 16 for each value of a and b
  !! ------------------------------------------------------------------------------------

  do a_i = 1, Nintegration_points
     do b_i = 1, Nintegration_points

        W_ab(a_i, b_i) = 2.0D0 * weights(a_i)*weights(b_i) *  (   &
             (3.0D0-a2(a_i))*a(b_i)*cos_a(b_i)*sin_a(a_i)  +   &
             (3.0D0-a2(b_i))*a(a_i)*cos_a(a_i)*sin_a(b_i)  +   &
             (a2(a_i)+a2(b_i)-3.0D0)*sin_a(a_i)*sin_a(b_i) -   &
             3.0D0*a(a_i)*a(b_i)*cos_a(a_i)*cos_a(b_i) )   /   &
             (a(a_i)*a(b_i))

     enddo
  enddo

  !! ------------------------------------------------------------------------------------



  !! Now, we loop over all the pairs q1,q2 that are assigned to us and perform our calculations
  !! -----------------------------------------------------------------------------------------------------

  do idx = 1, my_Nqs

     ! First, get the value of phi(q1*r, q2*r) for each r and the particular values of q1 and q2 we are using
     ! +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

     do r_i = 1, Nr_points

        d1 = q_mesh(indices(idx+my_start_q-1, 1)) * (dr * r_i)
        d2 = q_mesh(indices(idx+my_start_q-1, 2)) * (dr * r_i)

        phi(r_i, idx) = phi_value(d1, d2)
        
     end do

     ! +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

     ! Now, perform a radial FFT to turn our phi_alpha_beta(r) into phi_alpha_beta(k) needed for SOLER 
     ! equation 11
     call radial_fft( phi(:,idx) )
     
     ! Determine the spline interpolation coefficients for the Fourier transformed kernel function
     call set_up_splines( phi(:, idx), d2phi_dk2(:, idx) )
     
  end do
  
  !! -----------------------------------------------------------------------------------------------------
  
  
  !! Finally, we write out the results, after letting everybody catch up
  !! -----------------------------------------------------------------------------------------------------

  call mp_barrier( my_comm )
  call write_kernel_table_file(phi, d2phi_dk2)

  !! -----------------------------------------------------------------------------------------------------

  deallocate( phi, d2phi_dk2, indices, proc_indices )


CONTAINS



  !! ###########################################################################################################
  !!                                           |                  |
  !!                                           |  SET UP SPLINES  |
  !!                                           |__________________|

  !! This subroutine accepts a function (phi) and finds at each point the second derivative (D2) for use with
  !! spline interpolation.  This function assumes we are using the expansion described in SOLER 3 and 4.  That
  !! is, the derivatives are those needed to interpolate Kronecker delta functions at each of the q values
  !! Other than some special modification to speed up the algorithm in our particular case, this algorithm is
  !! taken directly from NUMERICAL_RECIPES pages 96-97.

  subroutine set_up_splines(phi, D2)

    real(dp), intent(in) :: phi(0:Nr_points)          !! The k-space kernel function for a particular q1 and q2

    real(dp), intent(inout) :: D2(0:Nr_points)        !! The second derivatives to be used in the interpolation
    !                                                 !! expansion (SOLER equation 3)

    real(dp), save :: dk = 2.0D0*pi/r_max             !! Spacing of k points

    real(dp), allocatable :: temp_array(:)            !! Temporary storage
    real(dp) :: temp_1, temp_2                        !! 

    allocate( temp_array(0:Nr_points) )

    D2 = 0

    temp_array = 0

    do r_i = 1, Nr_points - 1

       temp_1 = dble(r_i - (r_i - 1))/dble( (r_i + 1) - (r_i - 1) )
       temp_2 = temp_1 * D2(r_i-1) + 2.0D0

       D2(r_i) = (temp_1 - 1.0D0)/temp_2

       temp_array(r_i) = ( phi(r_i+1) - phi(r_i))/dble( dk*((r_i+1) - r_i) ) - &
            ( phi(r_i) - phi(r_i-1))/dble( dk*(r_i - (r_i-1)) )
       temp_array(r_i) = (6.0D0*temp_array(r_i)/dble( dk*((r_i+1) - (r_i-1)) )-temp_1*temp_array(r_i-1))/temp_2

    end do

    D2(Nr_points) = 0.0D0

    do  r_i = Nr_points-1, 0, -1

       D2(r_i) = D2(r_i)*D2(r_i+1) + temp_array(r_i)

    end do

    deallocate( temp_array )

  end subroutine set_up_splines


  !! ###########################################################################################################




  !! ###########################################################################################################
  !!                                          |             |
  !!                                          |  PHI_VALUE  |
  !!                                          |_____________|

  !! This function returns the value of the kernel calculated via DION equation 14.


  real(dp) function phi_value(d1, d2)

    real(dp), intent(in) :: d1, d2                !! The point at which to evaluate the kernel.  Note that
    !                                             !! d1 = q1*r and d2 = q2*r


    phi_value = 0.0D0

    if (d1==0 .and. d2==0) then

       phi_value = 0.0
       return

    end if
    

    !! Loop over all integration points and calculate the value of the nu functions defined in the 
    !! discussion below equation 16 in DION. There are a number of checks here to ensure that we don't
    !! run into numerical problems for very small d values.  They are probably unnecessary but I
    !! wanted to be careful.
    !! ----------------------------------------------------------------------------------------------

    do a_i = 1, Nintegration_points

       if ( a(a_i) <= small .and. d1 > small) then

          nu(a_i) = 9.0D0/8.0D0*d1**2/pi

       else if (d1 <= small) then

          nu(a_i) = a(a_i)**2/2.0D0

       else

          nu(a_i) = a(a_i)**2/((-exp(-(a(a_i)**2*gamma)/d1**2) + 1.0D0)*2.0D0)

       end if


       if ( a(a_i) <= small .and. d2 > small) then

          nu1(a_i) = 9.0D0/8.0D0*d2**2/pi

       else if (d2 < small) then

          nu1(a_i) = a(a_i)**2/2.0D0

       else

          nu1(a_i) = a(a_i)**2/((-exp(-(a(a_i)**2*gamma)/d2**2) + 1.0D0)*2.0D0)

       end if

    end do

    !! ----------------------------------------------------------------------------------------------


    !! Carry out the integration of DION equation 13
    !! ----------------------------------------------------------------------------------------------

    do a_i = 1, Nintegration_points
       do b_i = 1, Nintegration_points

          w = nu(a_i)
          x = nu(b_i)
          y = nu1(a_i)
          z = nu1(b_i)
          
          ! Again, watch out for possible numerical problems
          if (w < small .or. x<small .or. y<small .or. z<small) then

             cycle

          end if

          T = (1.0D0/(w+x) + 1.0D0/(y+z))*(1.0D0/((w+y)*(x+z)) + 1.0D0/((w+z)*(y+x)))

          phi_value = phi_value + T * W_ab(a_i, b_i)

       end do
    end do

    phi_value = 1.0D0/pi**2*phi_value

    !! ----------------------------------------------------------------------------------------------

    return

  end function phi_value


  !! ###########################################################################################################





  !! ###########################################################################################################
  !!                                       |                            |
  !!                                       |  PREP_GAUSSIAN_QUADRATURE  |
  !!                                       |____________________________|
  

  !! Routine to calculate the points and weights for the Gaussian-Legendre integration.  This routine is modeled
  !! after the routine GAULEG from NUMERICAL_RECIPES page 136.
  
  
  subroutine prep_gaussian_quadrature(a_min, a_max, a, weights, Nintegration_points)

    real(dp), intent(in) :: a_min, a_max                       !! The starting and ending points for the integration
    !                                                          !! over a (see DION equation 14.

    real(dp), intent(inout) :: a(:), weights(:)                !! The points and weights for the Gaussian-Legendre
    !                                                          !! integration

    integer, intent(in) :: Nintegration_points                 !! The number of integration points desired
    
    integer :: Npoints                                         !! The number of points we actually have to calculate.
    !                                                          !! The rest will be obtained from symmetry.
    
    real(dp) :: poly_1, poly_2, poly_3                         !! Temporary storage for Legendre Polynomials

    integer :: i_point, i_poly                                 !! Indexing variables 

    real(dp) :: root, dp_dx, last_root                         !! The value of the root of a given Legendre polynomial,
    !                                                          !! The derivative of the polynomial at that root and the value
    !                                                          !! of the root in the last iteration (to check for
    !                                                          !! convergence of Newton's method)
    
    real(dp) :: midpoint, length                               !! The middle of the x-range and the length to that point


    Npoints = (Nintegration_points + 1)/2
    midpoint = 0.5D0 * ( atan(a_min) + atan(a_max) )
    length = 0.5D0 * ( atan(a_max) - atan(a_min) )

    do i_point = 1, Npoints
       

       !! Make an initial guess for the root
       root = cos(dble(pi*(i_point - 0.25D0)/(Nintegration_points + 0.5D0)))

       do 

          !! Use the recurrence relations to find the desired polynomial, evaluated
          !! at the approximate root.  See NUMERICAL_RECIPES page 134
          !! ----------------------------------------------------------------------------------
          
          poly_1 = 1.0D0
          poly_2 = 0.0D0

          do i_poly = 1, Nintegration_points

             poly_3 = poly_2
             poly_2 = poly_1
             poly_1 = ((2.0D0 * i_poly - 1.0D0)*root*poly_2 - (i_poly-1.0D0)*poly_3)/i_poly

          end do

          !! ----------------------------------------------------------------------------------
          

          !! Find the derivative of the polynomial and use it in Newton's method to 
          !! refine our guess for the root.
          !! ----------------------------------------------------------------------------------

          dp_dx = Nintegration_points * (root*poly_1 - poly_2)/(root**2 - 1.0D0)
          
          last_root = root
          root = last_root - poly_1/dp_dx
          
          !! ----------------------------------------------------------------------------------


          !! Check for convergence
          !! ----------------------------------------------------------------------------------

          if (abs(root - last_root) <= 1.0D-14) then

             exit

          end if

          !! ----------------------------------------------------------------------------------

       end do
       
       !! Fill in the array of evaluation points
       !! -------------------------------------------------------------------------------------

       a(i_point) = midpoint - length*root
       a(Nintegration_points + 1 - i_point) = midpoint + length*root

       !! -------------------------------------------------------------------------------------

       
       !! Fill in the array of weights
       !! -------------------------------------------------------------------------------------
       
       weights(i_point) = 2.0D0 * length/((1.0D0 - root**2)*dp_dx**2)
       weights(Nintegration_points + 1 - i_point) = weights(i_point)

       !! -------------------------------------------------------------------------------------

    end do
    
  end subroutine prep_gaussian_quadrature
  
  
  !! ###########################################################################################################
  
  



  !! ###########################################################################################################
  !!                                   |                           |
  !!                                   |  WRITE_KERNEL_TABLE_FILE  |
  !!                                   |___________________________|

  !! Subroutine to write out the vdW_kernel_table file.  All processors pass their data to processor 0 which
  !! is the one that actually does the writing.  This is the only communication in the entire program.

  
  subroutine write_kernel_table_file(phi, d2phi_dk2)
    
    real(dp), target :: phi(:,:), d2phi_dk2(:,:)      !! Each processor passes in its array of kernel values and second
    !                                                 !! derivative values for the q-pairs it calculated.  They are stored
    !                                                 !! as phi(index of function, function_values)

    integer :: proc_Nqs                               !! Number of calculated functions for a particular processor

    real(dp), pointer :: data(:,:)                    !! Pointer to point to the needed section of the phi and d2phi_dk2
    !                                                 !! arrays.  This is needed because some processors may have calculated
    !                                                 !! 1 extra function if the number of processors is not an even divisor
    !                                                 !! of (Nqs^2+Nqs)/2.  Processor 0 is guaranteed to be one of the ones
    !                                                 !! with an extra calculation (if there are any), so it can collect the
    !                                                 !! arrays from other processors and put it in its array.  Data then 
    !                                                 !! points to either the entire array (if the other processor also had
    !                                                 !! an extra calculation), or just the first proc_Nqs entries (which is
    !                                                 !! guaranteed to be at most 1 less than the proc_Nqs for processor 0.


    if (ionode) then
       
       !! Open the file for writing.  The file is written in binary to save space.
       !open(UNIT=21, FILE='vdW_kernel_table', status='replace', form='unformatted', action='write')
       open(UNIT=21, FILE='vdW_kernel_table', status='replace', form='formatted', action='write') 
       !! Write the relevant header information that will be read in by the kernel_table module
       !! ---------------------------------------------------------------------------------------
       !write(*) "Writing headers..."

       write(21, '(2i5,f13.8)') Nqs, Nr_points
       write(21, double_format) r_max
       write(21, double_format) q_mesh
       !! ---------------------------------------------------------------------------------------



       !! Processor 0 writes its kernel functions first.  The subroutine "write_data" is defined
       !! below.
       !! ---------------------------------------------------------------------------------------
       !write(*) "Writing phi proc ", 0
       data => phi(:,:)
       call write_data(21, data)
 
       !! ---------------------------------------------------------------------------------------
       
    end if
    
    !! Now, loop over all other processors (if any) and collect their kernel functions in the phi
    !! array of processor 0, which is big enough to hold any of them.  Figure out how many functions
    !! should have been passed and make data point to just the right amount of the phi array.  Then
    !! write the data.
    !! -------------------------------------------------------------------------------------------

    do proc_i = 1, nproc-1
       
       call mp_get(phi, phi, mpime, 0, proc_i, 0, my_comm)
       
       if (ionode) then
          
          proc_Nqs = proc_indices(proc_i+1, 2) - proc_indices(proc_i+1,1) + 1
          
          !write(*) "Writing phi proc ", proc_i
          data => phi(:,1:proc_Nqs)
          call write_data(21, data)
          
       end if
       
    end do
    
    !! -------------------------------------------------------------------------------------------


    !! Here, we basically repeat the process exactly but for the second derivatives d2phi_dk2 
    !! instead of the kernel itself
    !! -------------------------------------------------------------------------------------------
    
    if (ionode) then
       
       !write(*) "Writing d2phi_dk2 proc ", 0
       data => d2phi_dk2(:,:)
       call write_data(21, data)

    end if
    
    
    do proc_i = 1, nproc-1
       
       call mp_get(d2phi_dk2, d2phi_dk2, mpime, 0, proc_i, 0, my_comm)
       
       if (mpime == 0) then
          
          proc_Nqs = proc_indices(proc_i+1,2) - proc_indices(proc_i+1,1) + 1
          
          !write(*) "Writing d2phi_dk2 proc ", proc_i 
          data => d2phi_dk2(:, 1:proc_Nqs)
          call write_data(21, data)
          
       end if
       
    end do
    
    !! -------------------------------------------------------------------------------------------

    if (ionode) then
       
       close(21)
       
    end if
    
  end subroutine write_kernel_table_file
  

  !! ###########################################################################################################







  !! ###########################################################################################################
  !!                                      |              |
  !!                                      |  WRITE_DATA  |
  !!                                      !______________|

  !! Write matrix data held in the point "array" to the file with unit number "file".  Data is written
  !! in binary format.

  subroutine write_data(file, array)
    
    real(dp), pointer:: array(:,:)     !! Input pointer to the matrix data to be written

    integer, intent(in) :: file                    !! Unit number of file to write to

    integer :: idx, ios                              !! Indexing variable
    

    do idx = 1, size(array,2)
       
       ! write(file) array(:,idx)
       write (file, double_format, err=100, iostat=ios) array(:,idx)    
    
    end do

  100 call errore ('generate_vdW_kernel_table', 'Writing table file', abs (ios) )
  
  end subroutine write_data
  

  !! ###########################################################################################################







  !! ###########################################################################################################
  !!                                            |              |
  !!                                            |  RADIAL_FFT  |
  !!                                            |______________|   

  !! This subroutine performs a radial Fourier transform on the real-space kernel functions.  Basically, this is
  !! just int( 4*pi*r^2*phi*sin(k*r)/(k*r))dr integrated from 0 to r_max.  That is, it is the kernel function phi
  !! integrated with the 0^th spherical Bessel function radially, with a 4*pi assumed from angular integration
  !! since we have spherical symmetry.  The spherical symmetry comes in because the kernel function depends only
  !! on the magnitude of the vector between two points.  The integration is done using the trapezoid rule.


  subroutine radial_fft(phi)

    real(dp), intent(inout) :: phi(0:Nr_points)      !! On input holds the real-space function phi_q1_q2(r)
    !                                                !! On output hold the reciprocal-space function phi_q1_q2(k)

    real(dp) :: phi_k(0:Nr_points)                   !! Temporary storage for phi_q1_q2(k)

    real(dp) :: dr = r_max/Nr_points                 !! Spacing between real-space sample points

    real(dp) :: dk = 2.0D0*pi/r_max                  !! Spacing between reciprocal space sample points

    integer :: k_i, r_i                              !! Indexing variables

    real(dp) :: r, k                                 !! The real and reciprocal space points


    phi_k = 0.0D0

    !! Handle the k=0 point separately
    !! -------------------------------------------------------------------------------------------------
    
    do r_i = 1, Nr_points

       r = r_i * dr

       phi_k(0) = phi_k(0) + phi(r_i)*r**2

    end do
    
    !! Subtract half of the last value of because of the trapezoid rule
    phi_k(0) = phi_k(0) - 0.5D0 * (Nr_points*dr)**2 * phi(Nr_points)

    !! -------------------------------------------------------------------------------------------------


    !! Integration for the rest of the k-points
    !! -------------------------------------------------------------------------------------------------
    
    do k_i = 1, Nr_points

       k = k_i * dk

       do r_i = 1, Nr_points

          r = r_i * dr

          phi_k(k_i) = phi_k(k_i) + phi(r_i) * r * sin(k*r) / k

       end do

       phi_k(Nr_points) = phi_k(Nr_points) - 0.5D0 * phi(Nr_points) * r *sin(k*r) / k

    end do
    
    !! Add in the 4*pi and the dr factor for the integration
    phi = 4.0D0 * pi * phi_k * dr

    !! -------------------------------------------------------------------------------------------------


  end subroutine radial_fft


  !! ###########################################################################################################



end SUBROUTINE generate_kernel
